In [ ]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
import shap
import lime
import lime.lime_tabular
import pickle
import warnings
warnings.filterwarnings('ignore')
shap.initjs()
In [ ]:
data_raw = pd.read_stata('MIMIC-SAD_dta_files/MIMIC-IV.dta')
data = pd.read_stata('MIMIC-SAD_dta_files/MIMIC-IV.dta')
dummies = pd.get_dummies(data['race'])
data = data.drop('race',axis=1).join(dummies)
dummies = pd.get_dummies(data['first_careunit'])
data = data.drop('first_careunit',axis=1).join(dummies)
In [ ]:
x=3
data.iloc[:,0+x*18:18+x*18]
data
Out[ ]:
| stay_id | age | weight | gender | temperature | heart_rate | resp_rate | spo2 | sbp | dbp | ... | OTHER | WHITE | unknown | CCU | CVICU | MICU | MICU/SICU | NICU | SICU | TSICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 30000646 | 44.0 | 79.000000 | 0 | 37.000000 | 100.0 | 28.0 | 98.0 | 107.0 | 66.0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 30001446 | 56.0 | 119.300003 | 0 | 36.720001 | 82.0 | 22.0 | 90.0 | 75.0 | 56.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 30002415 | 73.0 | 83.500000 | 1 | 36.439999 | 71.0 | 16.0 | 100.0 | 117.0 | 67.0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 30003226 | 67.0 | 93.449997 | 0 | 37.220001 | 89.0 | 17.0 | 98.0 | 111.0 | 63.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 30004242 | 76.0 | 77.599998 | 1 | 36.720001 | 59.0 | 21.0 | 97.0 | 107.0 | 90.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14615 | 39993425 | 93.0 | 47.799999 | 1 | 35.830002 | 115.0 | 16.0 | 99.0 | 97.0 | 71.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 14616 | 39993476 | 67.0 | 93.000000 | 0 | 36.439999 | 81.0 | 16.0 | 100.0 | 112.0 | 60.0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 14617 | 39993968 | 91.0 | 57.500000 | 1 | 35.830002 | 43.0 | 17.0 | 100.0 | 78.0 | 39.0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14618 | 39996044 | 59.0 | 66.400002 | 0 | 36.389999 | 105.0 | 23.0 | 100.0 | 107.0 | 63.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 14619 | 39999301 | 78.0 | 107.699997 | 0 | 36.610001 | 58.0 | 15.0 | 96.0 | 108.0 | 62.0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
14620 rows × 61 columns
In [ ]:
data2 = data.drop(['deliriumtime', 'hosp_mort', 'icu28dmort', 'stay_id', 'icustay', 'hospstay', 'sepsistime'], axis=1)
In [ ]:
# Compute the correlation matrix
corr = data_raw.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 16))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
Out[ ]:
<Axes: >
In [ ]:
x=2
data2.iloc[:,0+x*18:18+x*18]
data2
Out[ ]:
| age | weight | gender | temperature | heart_rate | resp_rate | spo2 | sbp | dbp | mbp | ... | OTHER | WHITE | unknown | CCU | CVICU | MICU | MICU/SICU | NICU | SICU | TSICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 44.0 | 79.000000 | 0 | 37.000000 | 100.0 | 28.0 | 98.0 | 107.0 | 66.0 | 75.0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 56.0 | 119.300003 | 0 | 36.720001 | 82.0 | 22.0 | 90.0 | 75.0 | 56.0 | 61.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 73.0 | 83.500000 | 1 | 36.439999 | 71.0 | 16.0 | 100.0 | 117.0 | 67.0 | 87.0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 67.0 | 93.449997 | 0 | 37.220001 | 89.0 | 17.0 | 98.0 | 111.0 | 63.0 | 71.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 76.0 | 77.599998 | 1 | 36.720001 | 59.0 | 21.0 | 97.0 | 107.0 | 90.0 | 94.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14615 | 93.0 | 47.799999 | 1 | 35.830002 | 115.0 | 16.0 | 99.0 | 97.0 | 71.0 | 80.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 14616 | 67.0 | 93.000000 | 0 | 36.439999 | 81.0 | 16.0 | 100.0 | 112.0 | 60.0 | 78.0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 14617 | 91.0 | 57.500000 | 1 | 35.830002 | 43.0 | 17.0 | 100.0 | 78.0 | 39.0 | 49.0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14618 | 59.0 | 66.400002 | 0 | 36.389999 | 105.0 | 23.0 | 100.0 | 107.0 | 63.0 | 80.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 14619 | 78.0 | 107.699997 | 0 | 36.610001 | 58.0 | 15.0 | 96.0 | 108.0 | 62.0 | 73.0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
14620 rows × 54 columns
In [ ]:
np.random.seed(123)
index = np.random.rand(len(data2)) < 0.7
train = data2[index]
test = data2[~index]
In [ ]:
test.to_stata('test_set.dta')
In [ ]:
train_raw = data_raw[index].to_stata('train_set_raw')
In [ ]:
print(index)
[ True True True ... True True True]
In [ ]:
train.to_stata('train_set')
In [ ]:
train_matrix = xgb.DMatrix(train.loc[:, ~train.columns.isin(["sad"])], label=train["sad"])
test_matrix = xgb.DMatrix(test.loc[:, ~test.columns.isin(["sad"])], label=test["sad"])
In [ ]:
params = {
"objective": "binary:logistic",
"eval_metric": "logloss",
"max_depth": 3,
"eta": 0.1,
"gamma": 0.5,
"colsample_bytree": 1,
"min_child_weight": 1,
"subsample": 0.5,
"seed": 123
}
watchlist = [(train_matrix, "train"), (test_matrix, "val")]
xgb_model = xgb.train(params=params, dtrain=train_matrix, num_boost_round=125, evals=watchlist,
early_stopping_rounds=10, verbose_eval=10)
[0] train-logloss:0.63979 val-logloss:0.64264 [10] train-logloss:0.56640 val-logloss:0.57160 [20] train-logloss:0.54387 val-logloss:0.55215 [30] train-logloss:0.53063 val-logloss:0.54359 [40] train-logloss:0.52102 val-logloss:0.53830 [50] train-logloss:0.51373 val-logloss:0.53539 [60] train-logloss:0.50700 val-logloss:0.53184 [70] train-logloss:0.50175 val-logloss:0.53095 [80] train-logloss:0.49681 val-logloss:0.53020 [90] train-logloss:0.49229 val-logloss:0.52977 [100] train-logloss:0.48864 val-logloss:0.53065
In [ ]:
print(type(xgb_model))
<class 'xgboost.core.Booster'>
In [ ]:
pickle.dump(xgb_model, open("xgb.pkl", "wb"))
In [ ]:
xgb_pred_prob = xgb_model.predict(test_matrix)
xgb_pred = np.where(xgb_pred_prob > 0.5, 1, 0)
xgb_pred_factor = pd.factorize(xgb_pred)[0]
test_sad_factor = pd.factorize(test["sad"])[0]
confusion_matrix = metrics.confusion_matrix(xgb_pred_factor, test_sad_factor)
print(confusion_matrix)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])
cm_display.plot()
plt.show()
[[2329 721] [ 407 904]]
In [ ]:
xgb_pred_prob
Out[ ]:
array([0.18503565, 0.6261598 , 0.1572482 , ..., 0.7053726 , 0.32250598,
0.18439463], dtype=float32)
In [ ]:
test[:1]
Out[ ]:
| age | weight | gender | temperature | heart_rate | resp_rate | spo2 | sbp | dbp | mbp | ... | OTHER | WHITE | unknown | CCU | CVICU | MICU | MICU/SICU | NICU | SICU | TSICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 76.0 | 77.599998 | 1 | 36.720001 | 59.0 | 21.0 | 97.0 | 107.0 | 90.0 | 94.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 rows × 54 columns
In [ ]:
test.iloc[:,41]
Out[ ]:
4 0
6 0
11 0
15 0
21 0
..
14597 0
14601 0
14606 0
14609 0
14612 0
Name: AISAN, Length: 4361, dtype: uint8
In [ ]:
xgb_model.predict(xgb.DMatrix(test.drop('sad',axis=1)[:1]))
Out[ ]:
array([0.18503565], dtype=float32)
In [ ]:
In [ ]:
xgb.plot_importance(xgb_model, importance_type='gain', max_num_features=30)
plt.title('feature importance: gain')
plt.show()
xgb.plot_importance(xgb_model, max_num_features=30)
plt.title('feature importance: weight')
plt.show()
xgb.plot_importance(xgb_model, importance_type='cover', max_num_features=30)
plt.title('feature importance: cover')
plt.show()
In [ ]:
data2.loc[:, ~data2.columns.isin(["sad"])]
Out[ ]:
| age | weight | gender | temperature | heart_rate | resp_rate | spo2 | sbp | dbp | mbp | ... | OTHER | WHITE | unknown | CCU | CVICU | MICU | MICU/SICU | NICU | SICU | TSICU | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 44.0 | 79.000000 | 0 | 37.000000 | 100.0 | 28.0 | 98.0 | 107.0 | 66.0 | 75.0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 56.0 | 119.300003 | 0 | 36.720001 | 82.0 | 22.0 | 90.0 | 75.0 | 56.0 | 61.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | 73.0 | 83.500000 | 1 | 36.439999 | 71.0 | 16.0 | 100.0 | 117.0 | 67.0 | 87.0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 3 | 67.0 | 93.449997 | 0 | 37.220001 | 89.0 | 17.0 | 98.0 | 111.0 | 63.0 | 71.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 76.0 | 77.599998 | 1 | 36.720001 | 59.0 | 21.0 | 97.0 | 107.0 | 90.0 | 94.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14615 | 93.0 | 47.799999 | 1 | 35.830002 | 115.0 | 16.0 | 99.0 | 97.0 | 71.0 | 80.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 14616 | 67.0 | 93.000000 | 0 | 36.439999 | 81.0 | 16.0 | 100.0 | 112.0 | 60.0 | 78.0 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 14617 | 91.0 | 57.500000 | 1 | 35.830002 | 43.0 | 17.0 | 100.0 | 78.0 | 39.0 | 49.0 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 14618 | 59.0 | 66.400002 | 0 | 36.389999 | 105.0 | 23.0 | 100.0 | 107.0 | 63.0 | 80.0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 14619 | 78.0 | 107.699997 | 0 | 36.610001 | 58.0 | 15.0 | 96.0 | 108.0 | 62.0 | 73.0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
14620 rows × 53 columns
In [ ]:
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(data2.loc[:, ~data2.columns.isin(["sad"])])
In [ ]:
#dit kan in scherm individuele patient
shap.force_plot(explainer.expected_value, shap_values[0, :], data2.loc[:, ~data2.columns.isin(["sad"])].iloc[0, :])
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]:
# dit is niet zo nuttig maar wel gewoon heel vet
shap.force_plot(
explainer.expected_value, shap_values[:1000, :], data2.loc[:, ~data2.columns.isin(["sad"])].iloc[:1000, :]
)
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]:
# dit kan handig zijn voor globale uitleg over model, even kijken of dit wenselijker is dan feature importance gain van XGB
shap.summary_plot(shap_values, data2.loc[:, ~data2.columns.isin(["sad"])], plot_type="bar")
In [ ]:
# wel mooi deze maar een beetje convoluted
shap.summary_plot(shap_values, data2.loc[:, ~data2.columns.isin(["sad"])])
LIME¶
In [ ]:
X = data2.loc[:, ~data2.columns.isin(["sad"])]
X_train = train.loc[:, ~train.columns.isin(["sad"])]
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=X.columns, mode='classification')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\indexes\base.py:3802, in Index.get_loc(self, key, method, tolerance) 3801 try: -> 3802 return self._engine.get_loc(casted_key) 3803 except KeyError as err: File ~\anaconda3\envs\INNO\lib\site-packages\pandas\_libs\index.pyx:138, in pandas._libs.index.IndexEngine.get_loc() File ~\anaconda3\envs\INNO\lib\site-packages\pandas\_libs\index.pyx:144, in pandas._libs.index.IndexEngine.get_loc() TypeError: '(slice(None, None, None), 0)' is an invalid key During handling of the above exception, another exception occurred: InvalidIndexError Traceback (most recent call last) Cell In[24], line 3 1 X = data2.loc[:, ~data2.columns.isin(["sad"])] 2 X_train = train.loc[:, ~train.columns.isin(["sad"])] ----> 3 explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=X.columns, mode='classification') File ~\anaconda3\envs\INNO\lib\site-packages\lime\lime_tabular.py:215, in LimeTabularExplainer.__init__(self, training_data, mode, training_labels, feature_names, categorical_features, categorical_names, kernel_width, kernel, verbose, class_names, feature_selection, discretize_continuous, discretizer, sample_around_instance, random_state, training_data_stats) 209 discretizer = StatsDiscretizer(training_data, self.categorical_features, 210 self.feature_names, labels=training_labels, 211 data_stats=self.training_data_stats, 212 random_state=self.random_state) 214 if discretizer == 'quartile': --> 215 self.discretizer = QuartileDiscretizer( 216 training_data, self.categorical_features, 217 self.feature_names, labels=training_labels, 218 random_state=self.random_state) 219 elif discretizer == 'decile': 220 self.discretizer = DecileDiscretizer( 221 training_data, self.categorical_features, 222 self.feature_names, labels=training_labels, 223 random_state=self.random_state) File ~\anaconda3\envs\INNO\lib\site-packages\lime\discretize.py:178, in QuartileDiscretizer.__init__(self, data, categorical_features, feature_names, labels, random_state) 176 def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None): --> 178 BaseDiscretizer.__init__(self, data, categorical_features, 179 feature_names, labels=labels, 180 random_state=random_state) File ~\anaconda3\envs\INNO\lib\site-packages\lime\discretize.py:51, in BaseDiscretizer.__init__(self, data, categorical_features, feature_names, labels, random_state, data_stats) 48 self.random_state = check_random_state(random_state) 50 # To override when implementing a custom binning ---> 51 bins = self.bins(data, labels) 52 bins = [np.unique(x) for x in bins] 54 # Read the stats from data_stats if exists File ~\anaconda3\envs\INNO\lib\site-packages\lime\discretize.py:185, in QuartileDiscretizer.bins(self, data, labels) 183 bins = [] 184 for feature in self.to_discretize: --> 185 qts = np.array(np.percentile(data[:, feature], [25, 50, 75])) 186 bins.append(qts) 187 return bins File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\frame.py:3807, in DataFrame.__getitem__(self, key) 3805 if self.columns.nlevels > 1: 3806 return self._getitem_multilevel(key) -> 3807 indexer = self.columns.get_loc(key) 3808 if is_integer(indexer): 3809 indexer = [indexer] File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\indexes\base.py:3809, in Index.get_loc(self, key, method, tolerance) 3804 raise KeyError(key) from err 3805 except TypeError: 3806 # If we have a listlike key, _check_indexing_error will raise 3807 # InvalidIndexError. Otherwise we fall through and re-raise 3808 # the TypeError. -> 3809 self._check_indexing_error(key) 3810 raise 3812 # GH#42269 File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\indexes\base.py:5925, in Index._check_indexing_error(self, key) 5921 def _check_indexing_error(self, key): 5922 if not is_scalar(key): 5923 # if key is not a scalar, directly raise an error (the code below 5924 # would convert to numpy arrays and raise later any way) - GH29926 -> 5925 raise InvalidIndexError(key) InvalidIndexError: (slice(None, None, None), 0)
In [ ]:
params = {
"objective": "binary:logistic",
"eval_metric": "logloss",
"max_depth": 3,
"eta": 0.1,
"gamma": 0.5,
"colsample_bytree": 1,
"min_child_weight": 1,
"subsample": 0.5,
"seed": 123
}
watchlist = [(train_matrix, "train"), (test_matrix, "val")]
xgb_model = xgb.train(params=params, dtrain=train_matrix, num_boost_round=125, evals=watchlist,
early_stopping_rounds=10, verbose_eval=10)
In [ ]: